Racing Bar Chart!!!

For #tidytuesday we’re looking at America’s National Parks! I have already blogged about how much I love the Nation Park System, so this is perfect. My goal is to create a racing bar chart to see which National Park sites have been the most popular since 1904.

Libraries

if (!require(pacman)) {install.packages('pacman')} 
p_load(
    gganimate
  , janitor
  , skimr
  , stringr
  , tidyverse
  , lubridate
  )

Import Data

Instead of using the #TidyTuesday data set file, since I have already been exploring the National Park Visitor data, I will use the data I’ve already gathered. The full public data can be found (here)https://bit.ly/2kDr64y. I also have imported a color palette based on the National Park passports. The passports break the country down into color coded regions. I plan to apply the regional colors to each park in the racing bar chart.

# NPS Visitors
nps_summary_raw <- read.csv("./data/annual_summary_report_1904-2018.csv",
                        stringsAsFactors = FALSE)

# NPS Region Color Table
nps_region_colors <- read.csv("./data/nps_region_colors.csv",
                              stringsAsFactors = FALSE)

# Park / Region Definitions
nps_park_region_list <- read.csv("./data/nps_parks_top_25.csv",
                              stringsAsFactors = FALSE)

Data Prep

AThe only important data is the park name, the year, and the number of visitors. So the data set can be greatly pared down. The park name should be treated as a factor. Also, as a side note, I like a consistent case and labeling structure, so I use the janitor package to clean the variable names

nps_summary <- nps_summary_raw %>% 
  clean_names() %>% 
  select(park_name, year, recreation_visitors) %>% 
  mutate(park_name = as.factor(park_name))

#glimpse(nps_summary)
#skim(nps_summary)

The racing bar chart will show the top park sites for each year. To visualize this I need to rank each park by population for each year. I will then filter the data by rank in order to only show the number of sites I decide to use.

#Number of parks to show on the bar chart
number_of_parks <- 15

#Create annual rank and filter by number of parks
nps_summary_rank <- nps_summary %>% 
  group_by(year) %>% 
  mutate(rank = as.integer(rank(-recreation_visitors))) %>% 
  ungroup() %>% 
  filter(rank <= number_of_parks) %>% 
  left_join(nps_park_region_list, by = "park_name") %>% 
  left_join(select(nps_region_colors, nps_region:nps_region_names), by = "nps_region" ) %>% 
  mutate(nps_region_names = as.factor(nps_region_names))
## Warning: Column `park_name` joining factor and character vector, coercing
## into character vector

In order to use the color palette in ggplot I need to develop a color map.

#Assign Colors to NPS Regions
color_map <- setNames(nps_region_colors$html_color_code_stamps,
                      nps_region_colors$nps_region_names)
# Generate list of top n ranked parks
nps_summary_rank_park_list <- nps_summary_rank %>% 
  distinct(park_name)

## Export Top N list for generating a cross reference of parks and regions
# write.csv(x = nps_summary_rank_park_list,
#           file = "./data/nps_parks_top_n.csv", 
#           row.names = FALSE)

Racing Bar Chart

To create a racing bar chart with R I will use gganimate. First I like to set up a function for the theme and other design parameters.

# Theme Parameters
theme_racing_bar <- function(){
  theme_minimal() +
    theme(
      axis.title = element_blank(),
      axis.text = element_blank(),
      panel.grid = element_blank(),
      legend.position = "bottom",
      legend.text = element_text(size = 16),
      legend.title = element_text(size = 20),
      legend.spacing.y = unit(20, "cm"),
      plot.margin = margin(2,2,2,2,"cm"),
      plot.title = element_text(
        hjust = 0.5, 
        face = "bold",
        size = 40)
    )
}

With that established the animation can be created.

# Create gganimate object
nps_racing_bar <- 
  ggplot(
    data = nps_summary_rank,
    mapping = aes(
      x = -rank,
      y = recreation_visitors,
      group = park_name
         )
    ) +
  geom_tile(
    mapping = aes(
      y = recreation_visitors/2,
      height = recreation_visitors, 
      width = 0.75,
      fill = nps_region_names
    )
  ) +
  geom_text(
    aes(label = scales::comma(recreation_visitors)),
    hjust = "left",
    size = 4,
    fontface = "bold",
    nudge_y = 1e5, 
    colour = "grey30"
    )+
  geom_text(
    aes(label = park_name),
    colour = "black", 
    fontface = "bold",
    size = 4,
    hjust = "right",
    nudge_y = -1e5
    )+
  geom_text(
    aes(
      x = -13,
      y = 14e6,
      label = paste0(year)
      ),
    size = 20
    ) +
  scale_y_continuous(
    labels = scales::comma,
    limits = c(-5.5e6,22e6)
    ) +
  scale_x_continuous(
    limits = c(-16,0)
    ) +
  scale_fill_manual(
    values = color_map, 
    drop = FALSE
    ) +
  guides(fill = guide_legend(nrow=3)) +
  coord_flip(clip = "off") + 
  ylab(label = "Visitors") +
  labs(title = "Most Visited National Park Sites",
       caption = "Data provided by US National Park Service",
       fill = "NPS Region") +
  theme_racing_bar() +
  transition_time(time = year) +
  ease_aes('cubic-in-out')

# Animate Plot
nps_racing_bar_gif <- animate(
  plot = nps_racing_bar,
  nframes = 2850,
  fps = 50,
  end_pause = 50,
  width = 960,
  height = 600
)

#nps_racing_bar_gif

#nps_racing_bar

Save Animation

# anim_save(filename = "nps_racing_bar.gif", 
#           animation = nps_racing_bar_gif, 
#           path = "./data/")